{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction to DataFrames\n",
    "**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Novemver 20, 2020**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "using DataFrames # load package"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "using CategoricalArrays # CategoricalArrays.jl is independent from DataFrames.jl but it is often used in combination"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Working with CategoricalArrays"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Constructor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4-element CategoricalArray{String,1,UInt32}:\n",
       " \"A\"\n",
       " \"B\"\n",
       " \"B\"\n",
       " \"C\""
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = categorical([\"A\", \"B\", \"B\", \"C\"]) # unordered"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4-element CategoricalArray{String,1,UInt32}:\n",
       " \"A\"\n",
       " \"B\"\n",
       " \"B\"\n",
       " \"C\""
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y = categorical([\"A\", \"B\", \"B\", \"C\"], ordered=true) # ordered, by default order is sorting order"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5-element CategoricalArray{Union{Missing, String},1,UInt32}:\n",
       " \"A\"\n",
       " \"B\"\n",
       " \"B\"\n",
       " \"C\"\n",
       " missing"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "z = categorical([\"A\",\"B\",\"B\",\"C\", missing]) # unordered with missings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10-element CategoricalArray{String,1,UInt32}:\n",
       " \"Q1: [1.0, 2.8)\"\n",
       " \"Q1: [1.0, 2.8)\"\n",
       " \"Q2: [2.8, 4.6)\"\n",
       " \"Q2: [2.8, 4.6)\"\n",
       " \"Q3: [4.6, 6.4)\"\n",
       " \"Q3: [4.6, 6.4)\"\n",
       " \"Q4: [6.4, 8.2)\"\n",
       " \"Q4: [6.4, 8.2)\"\n",
       " \"Q5: [8.2, 10.0]\"\n",
       " \"Q5: [8.2, 10.0]\""
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c = cut(1:10, 5) # ordered, into equal counts, possible to rename labels and give custom breaks"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "(we will cover grouping later, but let us here use it to analyze the results, we use Pipe.jl for chaining)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "using Pipe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>x</th><th>nrow</th></tr><tr><th></th><th>Cat…</th><th>Int64</th></tr></thead><tbody><p>10 rows × 2 columns</p><tr><th>1</th><td>Q1: [-4.24439, -1.29147)</td><td>10000</td></tr><tr><th>2</th><td>Q2: [-1.29147, -0.842611)</td><td>10000</td></tr><tr><th>3</th><td>Q3: [-0.842611, -0.525183)</td><td>10000</td></tr><tr><th>4</th><td>Q4: [-0.525183, -0.256296)</td><td>10000</td></tr><tr><th>5</th><td>Q5: [-0.256296, -0.00347552)</td><td>10000</td></tr><tr><th>6</th><td>Q6: [-0.00347552, 0.248533)</td><td>10000</td></tr><tr><th>7</th><td>Q7: [0.248533, 0.525627)</td><td>10000</td></tr><tr><th>8</th><td>Q8: [0.525627, 0.840749)</td><td>10000</td></tr><tr><th>9</th><td>Q9: [0.840749, 1.27245)</td><td>10000</td></tr><tr><th>10</th><td>Q10: [1.27245, 4.37721]</td><td>10000</td></tr></tbody></table>"
      ],
      "text/latex": [
       "\\begin{tabular}{r|cc}\n",
       "\t& x & nrow\\\\\n",
       "\t\\hline\n",
       "\t& Cat… & Int64\\\\\n",
       "\t\\hline\n",
       "\t1 & Q1: [-4.24439, -1.29147) & 10000 \\\\\n",
       "\t2 & Q2: [-1.29147, -0.842611) & 10000 \\\\\n",
       "\t3 & Q3: [-0.842611, -0.525183) & 10000 \\\\\n",
       "\t4 & Q4: [-0.525183, -0.256296) & 10000 \\\\\n",
       "\t5 & Q5: [-0.256296, -0.00347552) & 10000 \\\\\n",
       "\t6 & Q6: [-0.00347552, 0.248533) & 10000 \\\\\n",
       "\t7 & Q7: [0.248533, 0.525627) & 10000 \\\\\n",
       "\t8 & Q8: [0.525627, 0.840749) & 10000 \\\\\n",
       "\t9 & Q9: [0.840749, 1.27245) & 10000 \\\\\n",
       "\t10 & Q10: [1.27245, 4.37721] & 10000 \\\\\n",
       "\\end{tabular}\n"
      ],
      "text/plain": [
       "\u001b[1m10×2 DataFrame\u001b[0m\n",
       "\u001b[1m Row \u001b[0m│\u001b[1m x                            \u001b[0m\u001b[1m nrow  \u001b[0m\n",
       "\u001b[1m     \u001b[0m│\u001b[90m Cat…                         \u001b[0m\u001b[90m Int64 \u001b[0m\n",
       "─────┼─────────────────────────────────────\n",
       "   1 │ Q1: [-4.24439, -1.29147)      10000\n",
       "   2 │ Q2: [-1.29147, -0.842611)     10000\n",
       "   3 │ Q3: [-0.842611, -0.525183)    10000\n",
       "   4 │ Q4: [-0.525183, -0.256296)    10000\n",
       "   5 │ Q5: [-0.256296, -0.00347552)  10000\n",
       "   6 │ Q6: [-0.00347552, 0.248533)   10000\n",
       "   7 │ Q7: [0.248533, 0.525627)      10000\n",
       "   8 │ Q8: [0.525627, 0.840749)      10000\n",
       "   9 │ Q9: [0.840749, 1.27245)       10000\n",
       "  10 │ Q10: [1.27245, 4.37721]       10000"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "@pipe DataFrame(x=cut(randn(100000), 10)) |>\n",
    "      groupby(_, :x) |>\n",
    "      combine(_, nrow) # just to make sure cut works right"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5-element CategoricalArray{Int64,1,UInt32}:\n",
       " 1\n",
       " 2\n",
       " 2\n",
       " 3\n",
       " 3"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "v = categorical([1,2,2,3,3]) # contains integers not strings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5-element Array{Union{Missing, String},1}:\n",
       " \"A\"\n",
       " \"B\"\n",
       " \"B\"\n",
       " \"C\"\n",
       " missing"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Vector{Union{String, Missing}}(z) # sometimes you need to convert back to a standard vector"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Managing levels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5-element Array{CategoricalArray{T,1,UInt32,V,C,U} where U where C where V where T,1}:\n",
       " CategoricalValue{String,UInt32}[\"A\", \"B\", \"B\", \"C\"]\n",
       " CategoricalValue{String,UInt32}[\"A\", \"B\", \"B\", \"C\"]\n",
       " Union{Missing, CategoricalValue{String,UInt32}}[\"A\", \"B\", \"B\", \"C\", missing]\n",
       " CategoricalValue{String,UInt32}[\"Q1: [1.0, 2.8)\", \"Q1: [1.0, 2.8)\", \"Q2: [2.8, 4.6)\", \"Q2: [2.8, 4.6)\", \"Q3: [4.6, 6.4)\", \"Q3: [4.6, 6.4)\", \"Q4: [6.4, 8.2)\", \"Q4: [6.4, 8.2)\", \"Q5: [8.2, 10.0]\", \"Q5: [8.2, 10.0]\"]\n",
       " CategoricalValue{Int64,UInt32}[1, 2, 2, 3, 3]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "arr = [x,y,z,c,v]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5-element BitArray{1}:\n",
       " 0\n",
       " 1\n",
       " 0\n",
       " 1\n",
       " 0"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "isordered.(arr) # chcek if categorical array is orderd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(CategoricalValue{String,UInt32}[\"A\", \"B\", \"B\", \"C\"], true)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ordered!(x, true), isordered(x) # make x ordered"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(CategoricalValue{String,UInt32}[\"A\", \"B\", \"B\", \"C\"], false)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ordered!(x, false), isordered(x) # and unordered again"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5-element Array{Array{T,1} where T,1}:\n",
       " [\"A\", \"B\", \"C\"]\n",
       " [\"A\", \"B\", \"C\"]\n",
       " [\"A\", \"B\", \"C\"]\n",
       " [\"Q1: [1.0, 2.8)\", \"Q2: [2.8, 4.6)\", \"Q3: [4.6, 6.4)\", \"Q4: [6.4, 8.2)\", \"Q5: [8.2, 10.0]\"]\n",
       " [1, 2, 3]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "levels.(arr) # list levels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5-element Array{Array{T,1} where T,1}:\n",
       " [\"A\", \"B\", \"C\"]\n",
       " [\"A\", \"B\", \"C\"]\n",
       " Union{Missing, String}[\"A\", \"B\", \"C\", missing]\n",
       " [\"Q1: [1.0, 2.8)\", \"Q2: [2.8, 4.6)\", \"Q3: [4.6, 6.4)\", \"Q4: [6.4, 8.2)\", \"Q5: [8.2, 10.0]\"]\n",
       " [1, 2, 3]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "unique.(arr) # missing will be included"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "true"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y[1] < y[2] # can compare as y is ordered"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "ename": "LoadError",
     "evalue": "ArgumentError: Unordered CategoricalValue objects cannot be tested for order using <. Use isless instead, or call the ordered! function on the parent array to change this",
     "output_type": "error",
     "traceback": [
      "ArgumentError: Unordered CategoricalValue objects cannot be tested for order using <. Use isless instead, or call the ordered! function on the parent array to change this",
      "",
      "Stacktrace:",
      " [1] <(::CategoricalValue{Int64,UInt32}, ::CategoricalValue{Int64,UInt32}) at D:\\AppData\\.julia\\packages\\CategoricalArrays\\ZjBSI\\src\\value.jl:144",
      " [2] top-level scope at In[18]:1",
      " [3] include_string(::Function, ::Module, ::String, ::String) at .\\loading.jl:1091"
     ]
    }
   ],
   "source": [
    "v[1] < v[2] # not comparable, v is unordered although it contains integers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "false"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y[2] < \"A\" # comparison against type underlying categorical value is possible"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "ename": "LoadError",
     "evalue": "KeyError: key \"Z\" not found",
     "output_type": "error",
     "traceback": [
      "KeyError: key \"Z\" not found",
      "",
      "Stacktrace:",
      " [1] getindex at .\\dict.jl:467 [inlined]",
      " [2] get at D:\\AppData\\.julia\\packages\\CategoricalArrays\\ZjBSI\\src\\pool.jl:44 [inlined]",
      " [3] <(::CategoricalValue{String,UInt32}, ::String) at D:\\AppData\\.julia\\packages\\CategoricalArrays\\ZjBSI\\src\\value.jl:154",
      " [4] top-level scope at In[20]:1",
      " [5] include_string(::Function, ::Module, ::String, ::String) at .\\loading.jl:1091"
     ]
    }
   ],
   "source": [
    "y[2] < \"Z\" # but it is treated as a level, and thus only valid levels are allowed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4-element CategoricalArray{String,1,UInt32}:\n",
       " \"A\"\n",
       " \"B\"\n",
       " \"B\"\n",
       " \"C\""
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "levels!(y, [\"C\", \"B\", \"A\"]) # you can reorder levels, mostly useful for ordered CategoricalArrays"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "false"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y[1] < y[2] # observe that the order is changed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "false"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y[1] < \"B\" # level ordering is respected also when comparing with an underlying type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "ename": "LoadError",
     "evalue": "ArgumentError: cannot remove level \"C\" as it is used at position 4 and allowmissing=false.",
     "output_type": "error",
     "traceback": [
      "ArgumentError: cannot remove level \"C\" as it is used at position 4 and allowmissing=false.",
      "",
      "Stacktrace:",
      " [1] levels!(::CategoricalArray{Union{Missing, String},1,UInt32,String,CategoricalValue{String,UInt32},Missing}, ::Array{String,1}; allowmissing::Bool, allow_missing::Nothing) at D:\\AppData\\.julia\\packages\\CategoricalArrays\\ZjBSI\\src\\array.jl:796",
      " [2] levels!(::CategoricalArray{Union{Missing, String},1,UInt32,String,CategoricalValue{String,UInt32},Missing}, ::Array{String,1}) at D:\\AppData\\.julia\\packages\\CategoricalArrays\\ZjBSI\\src\\array.jl:777",
      " [3] top-level scope at In[24]:1",
      " [4] include_string(::Function, ::Module, ::String, ::String) at .\\loading.jl:1091"
     ]
    }
   ],
   "source": [
    "levels!(z, [\"A\", \"B\"]) # you have to specify all levels that are present"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5-element CategoricalArray{Union{Missing, String},1,UInt32}:\n",
       " \"A\"\n",
       " \"B\"\n",
       " \"B\"\n",
       " missing\n",
       " missing"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "levels!(z, [\"A\", \"B\"], allowmissing=true) # unless the underlying array allows for missings and force removal of levels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5-element CategoricalArray{Union{Missing, String},1,UInt32}:\n",
       " \"B\"\n",
       " \"B\"\n",
       " \"B\"\n",
       " missing\n",
       " missing"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "z[1] = \"B\"\n",
    "z # now z has only \"B\" entries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2-element Array{String,1}:\n",
       " \"A\"\n",
       " \"B\""
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "levels(z) # but it remembers the levels it had (the reason is mostly performance)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1-element Array{String,1}:\n",
       " \"B\""
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "droplevels!(z) # this way we can clean it up\n",
    "levels(z)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data manipulation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(CategoricalValue{String,UInt32}[\"A\", \"B\", \"B\", \"C\"], [\"A\", \"B\", \"C\"])"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x, levels(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(CategoricalValue{String,UInt32}[\"A\", \"0\", \"B\", \"C\"], [\"A\", \"B\", \"C\", \"0\"])"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x[2] = \"0\"\n",
    "x, levels(x) # new level added at the end (works only for unordered)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(CategoricalValue{Int64,UInt32}[1, 2, 2, 3, 3], [1, 2, 3])"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "v, levels(v)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "ename": "LoadError",
     "evalue": "MethodError: no method matching +(::CategoricalValue{Int64,UInt32}, ::CategoricalValue{Int64,UInt32})\nClosest candidates are:\n  +(::Any, ::Any, !Matched::Any, !Matched::Any...) at operators.jl:538",
     "output_type": "error",
     "traceback": [
      "MethodError: no method matching +(::CategoricalValue{Int64,UInt32}, ::CategoricalValue{Int64,UInt32})\nClosest candidates are:\n  +(::Any, ::Any, !Matched::Any, !Matched::Any...) at operators.jl:538",
      "",
      "Stacktrace:",
      " [1] top-level scope at In[32]:1",
      " [2] include_string(::Function, ::Module, ::String, ::String) at .\\loading.jl:1091"
     ]
    }
   ],
   "source": [
    "v[1] + v[2] # even though the underlying data is Int, we cannot operate on it"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5-element Array{Int64,1}:\n",
       " 1\n",
       " 2\n",
       " 2\n",
       " 3\n",
       " 3"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Vector{Int}(v) # you have either to retrieve the data by conversion (may be expensive)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get(v[1]) + get(v[2]) # or get a single value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5-element Array{Int64,1}:\n",
       " 1\n",
       " 2\n",
       " 2\n",
       " 3\n",
       " 3"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get.(v) # this will work for arrays witout missings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "ename": "LoadError",
     "evalue": "MethodError: no method matching get(::Missing)\nClosest candidates are:\n  get(!Matched::REPL.Terminals.TTYTerminal, !Matched::Any, !Matched::Any) at C:\\buildbot\\worker\\package_win64\\build\\usr\\share\\julia\\stdlib\\v1.5\\REPL\\src\\Terminals.jl:161\n  get(!Matched::ZMQ.Context, !Matched::Integer) at D:\\AppData\\.julia\\packages\\ZMQ\\R3wSD\\src\\context.jl:113\n  get(!Matched::ZMQ.Message, !Matched::Integer) at D:\\AppData\\.julia\\packages\\ZMQ\\R3wSD\\src\\message.jl:157\n  ...",
     "output_type": "error",
     "traceback": [
      "MethodError: no method matching get(::Missing)\nClosest candidates are:\n  get(!Matched::REPL.Terminals.TTYTerminal, !Matched::Any, !Matched::Any) at C:\\buildbot\\worker\\package_win64\\build\\usr\\share\\julia\\stdlib\\v1.5\\REPL\\src\\Terminals.jl:161\n  get(!Matched::ZMQ.Context, !Matched::Integer) at D:\\AppData\\.julia\\packages\\ZMQ\\R3wSD\\src\\context.jl:113\n  get(!Matched::ZMQ.Message, !Matched::Integer) at D:\\AppData\\.julia\\packages\\ZMQ\\R3wSD\\src\\message.jl:157\n  ...",
      "",
      "Stacktrace:",
      " [1] _broadcast_getindex_evalf at .\\broadcast.jl:648 [inlined]",
      " [2] _broadcast_getindex at .\\broadcast.jl:621 [inlined]",
      " [3] getindex at .\\broadcast.jl:575 [inlined]",
      " [4] macro expansion at .\\broadcast.jl:932 [inlined]",
      " [5] macro expansion at .\\simdloop.jl:77 [inlined]",
      " [6] copyto! at .\\broadcast.jl:931 [inlined]",
      " [7] copyto! at .\\broadcast.jl:886 [inlined]",
      " [8] copy at .\\broadcast.jl:862 [inlined]",
      " [9] materialize(::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1},Nothing,typeof(get),Tuple{CategoricalArray{Union{Missing, String},1,UInt32,String,CategoricalValue{String,UInt32},Missing}}}) at .\\broadcast.jl:837",
      " [10] top-level scope at In[36]:1",
      " [11] include_string(::Function, ::Module, ::String, ::String) at .\\loading.jl:1091"
     ]
    }
   ],
   "source": [
    "get.(z) # but will fail on missing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5-element Array{Union{Missing, String},1}:\n",
       " \"B\"\n",
       " \"B\"\n",
       " \"B\"\n",
       " missing\n",
       " missing"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "passmissing(get).(z) # you have to wrap it in passmissing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5-element Array{Union{Missing, String},1}:\n",
       " \"B\"\n",
       " \"B\"\n",
       " \"B\"\n",
       " missing\n",
       " missing"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Vector{Union{String, Missing}}(z) # or do the conversion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6-element Array{Union{Missing, Int64},1}:\n",
       " 10\n",
       "  2\n",
       "  3\n",
       "  4\n",
       "  5\n",
       "   missing"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "recode([1,2,3,4,5,missing], 1=>10) # recode some values in an array; has also in place recode! equivalent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6-element Array{Union{Missing, Int64, String},1}:\n",
       " 10\n",
       " 20\n",
       "   \"a\"\n",
       "   \"a\"\n",
       "   \"a\"\n",
       "   missing"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "recode([1,2,3,4,5,missing], \"a\", 1=>10, 2=>20) # here we provided a default value for not mapped recodings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6-element Array{Union{Int64, String},1}:\n",
       " 10\n",
       "  2\n",
       "  3\n",
       "  4\n",
       "  5\n",
       "   \"missing\""
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "recode([1,2,3,4,5,missing], 1=>10, missing=>\"missing\") # to recode Missing you have to do it explicitly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(Union{Missing, CategoricalValue{Int64,UInt32}}[1, 2, 3, 4, 5, missing], [1, 2, 3, 4, 5])"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t = categorical([1:5; missing])\n",
    "t, levels(t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(Union{Missing, CategoricalValue{Int64,UInt32}}[2, 2, 2, 4, 5, missing], [2, 4, 5])"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "recode!(t, [1,3]=>2)\n",
    "t, levels(t) # note that the levels are dropped after recode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3-element Array{Int64,1}:\n",
       "  3\n",
       "  0\n",
       " -1"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t = categorical([1,2,3], ordered=true)\n",
    "levels(recode(t, 2=>0, 1=>-1)) # and if you introduce a new levels they are added at the end in the order of appearance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3-element Array{Int64,1}:\n",
       " 100\n",
       " 200\n",
       " 300"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t = categorical([1,2,3,4,5], ordered=true) # when using default it becomes the last level\n",
    "levels(recode(t, 300, [1,2]=>100, 3=>200))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Comparisons"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4×4 Array{Bool,2}:\n",
       " 1  1  1  1\n",
       " 1  1  1  1\n",
       " 1  1  1  1\n",
       " 1  1  1  1"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = categorical([1,2,3])\n",
    "xs = [x, categorical(x), categorical(x, ordered=true), categorical(x, ordered=true)]\n",
    "levels!(xs[2], [3,2,1])\n",
    "levels!(xs[4], [2,3,1])\n",
    "[a == b for a in xs, b in xs] # all are equal - comparison only by contents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4×4 Array{Bool,2}:\n",
       " 1  0  0  0\n",
       " 0  1  0  0\n",
       " 0  0  1  0\n",
       " 0  0  0  1"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "signature(x::CategoricalArray) = (x, levels(x), isordered(x)) # this is actually the full signature of CategoricalArray\n",
    "# all are different, notice that x[1] and x[2] are unordered but have a different order of levels\n",
    "[signature(a) == signature(b) for a in xs, b in xs]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "ename": "LoadError",
     "evalue": "ArgumentError: Unordered CategoricalValue objects cannot be tested for order using <. Use isless instead, or call the ordered! function on the parent array to change this",
     "output_type": "error",
     "traceback": [
      "ArgumentError: Unordered CategoricalValue objects cannot be tested for order using <. Use isless instead, or call the ordered! function on the parent array to change this",
      "",
      "Stacktrace:",
      " [1] <(::CategoricalValue{Int64,UInt32}, ::CategoricalValue{Int64,UInt32}) at D:\\AppData\\.julia\\packages\\CategoricalArrays\\ZjBSI\\src\\value.jl:144",
      " [2] top-level scope at In[48]:1",
      " [3] include_string(::Function, ::Module, ::String, ::String) at .\\loading.jl:1091"
     ]
    }
   ],
   "source": [
    "x[1] < x[2] # you cannot compare elements of unordered CategoricalArray"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "true"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t[1] < t[2] # but you can do it for an ordered one"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "true"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "isless(x[1], x[2]) # isless works within the same CategoricalArray even if it is not ordered"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "ename": "LoadError",
     "evalue": "ArgumentError: CategoricalValue objects with different pools cannot be tested for order",
     "output_type": "error",
     "traceback": [
      "ArgumentError: CategoricalValue objects with different pools cannot be tested for order",
      "",
      "Stacktrace:",
      " [1] isless(::CategoricalValue{Int64,UInt32}, ::CategoricalValue{Int64,UInt32}) at D:\\AppData\\.julia\\packages\\CategoricalArrays\\ZjBSI\\src\\value.jl:131",
      " [2] top-level scope at In[51]:2",
      " [3] include_string(::Function, ::Module, ::String, ::String) at .\\loading.jl:1091"
     ]
    }
   ],
   "source": [
    "y = deepcopy(x) # but not across categorical arrays\n",
    "isless(x[1], y[2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "true"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "isless(get(x[1]), get(y[2])) # you can use get to make a comparison of the contents of CategoricalArray"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "false"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x[1] == y[2] # equality tests works OK across CategoricalArrays"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Categorical columns in a DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>x</th><th>y</th><th>z</th></tr><tr><th></th><th>Int64</th><th>Char</th><th>String</th></tr></thead><tbody><p>3 rows × 3 columns</p><tr><th>1</th><td>1</td><td>a</td><td>a</td></tr><tr><th>2</th><td>2</td><td>b</td><td>b</td></tr><tr><th>3</th><td>3</td><td>c</td><td>c</td></tr></tbody></table>"
      ],
      "text/latex": [
       "\\begin{tabular}{r|ccc}\n",
       "\t& x & y & z\\\\\n",
       "\t\\hline\n",
       "\t& Int64 & Char & String\\\\\n",
       "\t\\hline\n",
       "\t1 & 1 & a & a \\\\\n",
       "\t2 & 2 & b & b \\\\\n",
       "\t3 & 3 & c & c \\\\\n",
       "\\end{tabular}\n"
      ],
      "text/plain": [
       "\u001b[1m3×3 DataFrame\u001b[0m\n",
       "\u001b[1m Row \u001b[0m│\u001b[1m x     \u001b[0m\u001b[1m y    \u001b[0m\u001b[1m z      \u001b[0m\n",
       "\u001b[1m     \u001b[0m│\u001b[90m Int64 \u001b[0m\u001b[90m Char \u001b[0m\u001b[90m String \u001b[0m\n",
       "─────┼─────────────────────\n",
       "   1 │     1  a     a\n",
       "   2 │     2  b     b\n",
       "   3 │     3  c     c"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = DataFrame(x = 1:3, y = 'a':'c', z = [\"a\",\"b\",\"c\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Convert all string columns to categorical in-place"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>x</th><th>y</th><th>z</th></tr><tr><th></th><th>Int64</th><th>Char</th><th>Cat…</th></tr></thead><tbody><p>3 rows × 3 columns</p><tr><th>1</th><td>1</td><td>a</td><td>a</td></tr><tr><th>2</th><td>2</td><td>b</td><td>b</td></tr><tr><th>3</th><td>3</td><td>c</td><td>c</td></tr></tbody></table>"
      ],
      "text/latex": [
       "\\begin{tabular}{r|ccc}\n",
       "\t& x & y & z\\\\\n",
       "\t\\hline\n",
       "\t& Int64 & Char & Cat…\\\\\n",
       "\t\\hline\n",
       "\t1 & 1 & a & a \\\\\n",
       "\t2 & 2 & b & b \\\\\n",
       "\t3 & 3 & c & c \\\\\n",
       "\\end{tabular}\n"
      ],
      "text/plain": [
       "\u001b[1m3×3 DataFrame\u001b[0m\n",
       "\u001b[1m Row \u001b[0m│\u001b[1m x     \u001b[0m\u001b[1m y    \u001b[0m\u001b[1m z    \u001b[0m\n",
       "\u001b[1m     \u001b[0m│\u001b[90m Int64 \u001b[0m\u001b[90m Char \u001b[0m\u001b[90m Cat… \u001b[0m\n",
       "─────┼───────────────────\n",
       "   1 │     1  a     a\n",
       "   2 │     2  b     b\n",
       "   3 │     3  c     c"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "transform!(df, names(df, String) => categorical, renamecols=false)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>variable</th><th>mean</th><th>min</th><th>median</th><th>max</th><th>nmissing</th><th>eltype</th></tr><tr><th></th><th>Symbol</th><th>Union…</th><th>Any</th><th>Union…</th><th>Any</th><th>Int64</th><th>DataType</th></tr></thead><tbody><p>3 rows × 7 columns</p><tr><th>1</th><td>x</td><td>2.0</td><td>1</td><td>2.0</td><td>3</td><td>0</td><td>Int64</td></tr><tr><th>2</th><td>y</td><td></td><td>a</td><td></td><td>c</td><td>0</td><td>Char</td></tr><tr><th>3</th><td>z</td><td></td><td>a</td><td></td><td>c</td><td>0</td><td>CategoricalValue{String,UInt32}</td></tr></tbody></table>"
      ],
      "text/latex": [
       "\\begin{tabular}{r|ccccccc}\n",
       "\t& variable & mean & min & median & max & nmissing & eltype\\\\\n",
       "\t\\hline\n",
       "\t& Symbol & Union… & Any & Union… & Any & Int64 & DataType\\\\\n",
       "\t\\hline\n",
       "\t1 & x & 2.0 & 1 & 2.0 & 3 & 0 & Int64 \\\\\n",
       "\t2 & y &  & a &  & c & 0 & Char \\\\\n",
       "\t3 & z &  & a &  & c & 0 & CategoricalValue\\{String,UInt32\\} \\\\\n",
       "\\end{tabular}\n"
      ],
      "text/plain": [
       "\u001b[1m3×7 DataFrame\u001b[0m\n",
       "\u001b[1m Row \u001b[0m│\u001b[1m variable \u001b[0m\u001b[1m mean   \u001b[0m\u001b[1m min \u001b[0m\u001b[1m median \u001b[0m\u001b[1m max \u001b[0m\u001b[1m nmissing \u001b[0m\u001b[1m eltype                   \u001b[0m ⋯\n",
       "\u001b[1m     \u001b[0m│\u001b[90m Symbol   \u001b[0m\u001b[90m Union… \u001b[0m\u001b[90m Any \u001b[0m\u001b[90m Union… \u001b[0m\u001b[90m Any \u001b[0m\u001b[90m Int64    \u001b[0m\u001b[90m DataType                 \u001b[0m ⋯\n",
       "─────┼──────────────────────────────────────────────────────────────────────────\n",
       "   1 │ x         2.0     1    2.0     3           0  Int64                     ⋯\n",
       "   2 │ y        \u001b[90m        \u001b[0m a   \u001b[90m        \u001b[0m c           0  Char\n",
       "   3 │ z        \u001b[90m        \u001b[0m a   \u001b[90m        \u001b[0m c           0  CategoricalValue{String,U\n",
       "\u001b[31m                                                                1 column omitted\u001b[0m"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "describe(df)"
   ]
  }
 ],
 "metadata": {
  "@webio": {
   "lastCommId": null,
   "lastKernelId": null
  },
  "kernelspec": {
   "display_name": "Julia 1.5.3",
   "language": "julia",
   "name": "julia-1.5"
  },
  "language_info": {
   "file_extension": ".jl",
   "mimetype": "application/julia",
   "name": "julia",
   "version": "1.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
